import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import sklearn
import seaborn as sns
import os
import re
import plotly.express as px
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Embedding, LSTM, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import nltk
from nltk.corpus import stopwords 
from nltk import word_tokenize

from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis

import warnings
warnings.filterwarnings('ignore')
Using TensorFlow backend.
qc_questions = pd.read_csv('QC-ARC-Questions.tsv',delimiter='\t',encoding='utf-8')
qc_taxonomy = pd.read_csv('QC-ARC-Taxonomy.tsv', delimiter='\t', encoding='utf-8')
qc_questions.head()
questionID originalQuestionID totalPossiblePoint AnswerKey isMultipleChoiceQuestion includesDiagram examName grade year QCLabel Question subject category fold
0 Mercury_SC_405198 405198 1 D 1 0 Mercury 3 2015 LIFE_FUNCT_ENVCOND_PLANT Which of these will most likely increase a pla... NaN Train Challenge
1 Mercury_SC_415013 415013 1 B 1 0 Mercury 3 2015 EARTH_INNER_INTHEAT, EARTH_INNER_PLATE_VOLC Which rapid changes are caused by heat from in... NaN Train Easy
2 Mercury_SC_405298 405298 1 C 1 0 Mercury 3 2015 OTHER Which invention will best help people travel q... NaN Train Easy
3 VASoL_2007_3_6 6 1 C 1 0 Virginia Standards of Learning - Science 3 2007 EARTH_HUMIMP_WHAT_HABITATDEST A wetland habitat can continue to support the ... NaN Train Challenge
4 Mercury_SC_416525 416525 1 A 1 0 Mercury 3 2015 LIFE_FUNCT_FEATANDFUNCT_PLANT_LEAF Randall is picking lettuce from his garden for... NaN Train Easy
qc_questions['QCLabel'].value_counts()
CEL_CYCLES                                                                                    146
MAT_CHEM_ATOMIC                                                                               114
MAT_CHEM_PERIODICTAB                                                                          100
SCI_INFERENCE_EXPDESIGN                                                                        86
MAT_CHANGES_CHEMICAL                                                                           81
                                                                                             ... 
EARTH_WEATHER_WATERCYC_EVAPCOND, EARTH_WEATHER_WATERCYC_PREC                                    1
ENG_THERM_COND_CONVECT, EARTH_INNER_CMC_MANTLE                                                  1
LIFE_FUNCT_FEATANDFUNCT_CELLBIO_STRUCT_CELLWALL, LIFE_FUNCT_FEATANDFUNCT_CELLBIO_PLANTCELL      1
ENG_DEVICES, ENG_CONV_INEFFIC, ENG_FORMS_HEAT                                                   1
FOR_PRESSURE, EARTH_OUTER_HYDRO                                                                 1
Name: QCLabel, Length: 1329, dtype: int64
qc_questions.shape[0]
7787

Exploratory Data Analysis

For this stage, I will be trying out a variety of visualization techniques on the K12 QC ARC question banks based on their categories and the frequencies of similar questions across different subjects.

Techniques to Include:

  • WordCloud
  • TF-IDF Vectorizer for Feature Extraction
  • Clustering Using MiniBatch K-Means

WordCloud

To perform a WordCloud visualization on the questions, we will be running all the questions through the following data pipeline:

  • tokenization
  • remove stop words
  • tagging
  • keeping only nouns
  • apply port stemmer and remove unit-length word
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 
[nltk_data] Downloading package punkt to /Users/paulshao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/paulshao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paulshao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
def port_stem(s):
    p_stemmer = nltk.PorterStemmer()
    return [p_stemmer.stem(word) for word in s] 
tokenized_questions = qc_questions['Question'].str.lower().apply(nltk.word_tokenize)
tokenized_questions = tokenized_questions.apply(lambda tokens: [w for w in tokens if not w in stop_words])
tokenized_q_pos = tokenized_questions.apply(nltk.pos_tag)
tokenized_nouns = tokenized_q_pos.apply(lambda tags: [word for word,pos in tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')])
tokenized_nouns = tokenized_nouns.apply(port_stem).apply(lambda nouns: [noun for noun in nouns if len(noun) > 1])
retokenized = tokenized_nouns.apply(lambda tokens: ' '.join(tokens))
qtext = ''
for token in retokenized:
    qtext += token + ' '
wordcloud = WordCloud(background_color="white").generate(qtext)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
category = qc_questions['QCLabel'].apply(lambda x: x.split('_')[0]).apply(lambda x: 'LIFE' if x == 'OTHER, LIFE' else x)
qc_questions['Subject'] = category
plt.title('Distribution of Subjects Per Exam Question')
sns.countplot(x='Subject', data=qc_questions, palette="Set3", order = qc_questions['Subject'].value_counts().index);

TF-IDF Vectorization and MiniBatch K-Means

As an initial next step, to visualization how each exam question is phrased and the simillarity/difference among them across different subjects, we will compute the TF-IDF values as part of our feature extraction process and attempt to run an initial MiniBatch K-means clustering algorithm on the data.

As a first step, we will obtain the feature matrix on all the questions using the TfidfVectorizer.

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() 
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    text = BAD_SYMBOLS_RE.sub('', text) 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/paulshao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
qc_questions['Q_cleaned'] = qc_questions['Question'].apply(clean_text).str.replace('\d+', '')
vec = TfidfVectorizer(stop_words="english")
vec.fit(qc_questions['Q_cleaned'].values)
features = vec.transform(qc_questions['Q_cleaned'].values)

In order to determine how many clusters we would like our K-means algorithm to aim for, we can start with a random guess and gradually increase the number of clusters.

cls_s = []
for i in range(5, 15):
    cls = MiniBatchKMeans(n_clusters=i, random_state=0)
    cls.fit(features)
    cls_s.append(cls)
for cls in cls_s:
    cls.predict(features)

To visualize, we’ll plot the features in a 3D space. As we know the dimension of features that we obtained from TfIdfVectorizer could be quite large, we need to reduce the dimension before we can plot. For this, we’ll ues PCA to transform our high dimensional features into 3 dimensions.

Note: As we hover over individual data points (exam questions) in the 3D scatterplot below, we can see the corresponding subject for the question.

pca = PCA(n_components=3)
reduced_features = pca.fit_transform(features.toarray())
reduced_cluster_centers = pca.transform(cls.cluster_centers_)

5 Clusters

import plotly.graph_objects as go
fig = go.Figure(data=[
                      go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2],
                                   mode='markers',text=qc_questions['Subject'], marker=dict(size=4,color=cls_s[0].predict(features),                
                                  colorscale='Rainbow',opacity=0.6)),
                      go.Scatter3d(x=reduced_cluster_centers[:,0], y=reduced_cluster_centers[:,1], z=reduced_cluster_centers[:,2],
                                   mode='markers',marker=dict(size=6, symbol='x', color='rgb(0, 0, 0)'))
                      ]
                )
fig.update_layout( 
    scene = dict(
                xaxis_title='PC1',
                yaxis_title='PC2',
                zaxis_title='PC3'
    ),
    title='K-Means Clustering of Exam Questions (5 Clusters)'
)
fig.show()

10 Clusters

fig = go.Figure(data=[
                      go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2],
                                   mode='markers',text=qc_questions['Subject'], marker=dict(size=4,color=cls_s[5].predict(features),                
                                  colorscale='Rainbow',opacity=0.6)),
                      go.Scatter3d(x=reduced_cluster_centers[:,0], y=reduced_cluster_centers[:,1], z=reduced_cluster_centers[:,2],
                                   mode='markers',marker=dict(size=6, symbol='x', color='rgb(0, 0, 0)'))
                      ]
                )
fig.update_layout( 
    scene = dict(
                xaxis_title='PC1',
                yaxis_title='PC2',
                zaxis_title='PC3'
    ),
    title='K-Means Clustering of Exam Questions (10 Clusters)'
)
fig.show()

15 clusters

fig = go.Figure(data=[
                      go.Scatter3d(x=reduced_features[:,0], y=reduced_features[:,1], z=reduced_features[:,2],
                                   mode='markers', text=qc_questions['Subject'], marker=dict(size=4,color=cls_s[-1].predict(features),                
                                  colorscale='Rainbow',opacity=0.6)),
                      go.Scatter3d(x=reduced_cluster_centers[:,0], y=reduced_cluster_centers[:,1], z=reduced_cluster_centers[:,2],
                                   mode='markers',marker=dict(size=6, symbol='x', color='rgb(0, 0, 0)'))
                      ]
                )

fig.update_layout( 
    scene = dict(
                xaxis_title='PC1',
                yaxis_title='PC2',
                zaxis_title='PC3'
    ),
    title='K-Means Clustering of Exam Questions (15 Clusters)'
)
fig.show()

LDA (Latent Dirichlet Allocation)

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words='english')
count_data = count_vectorizer.fit_transform(qc_questions['Question'])

def count_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
number_topics = 20
number_words = 10

lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)

print("Topics found via LDA:")
count_topics(lda, count_vectorizer, number_words)
Topics found via LDA:

Topic #0:
increase likely population decrease species fish ecosystem area increases number

Topic #1:
plants plant food soil water animals organisms leaves tree grow

Topic #2:
number atom electrons protons neutrons graph best core table mass

Topic #3:
air waves sound weather cold warm high light heat low

Topic #4:
energy chemical reaction heat light process electrical mechanical new form

Topic #5:
theory best balloon ocean color evidence continents plants trait following

Topic #6:
earth sun moon axis gravity day rotation force gravitational solar

Topic #7:
ocean plate plates continental sea tectonic oceans earthquakes changes likely

Topic #8:
rock water rocks erosion likely weathering river sedimentary formed wind

Topic #9:
energy coal oil car natural resources heat wind solar used

Topic #10:
water liquid solid gas change salt sugar matter substance mixture

Topic #11:
cell best membrane plastic light electricity thermometer copper microscope used

Topic #12:
mineral organisms best following classified color organism likely different stars

Topic #13:
carbon dioxide earth atmosphere water oxygen surface cycle evaporation layer

Topic #14:
light stars distance fur star year galaxy universe unit color

Topic #15:
scientists people new scientific scientist use likely results paper research

Topic #16:
light sun winter star planet planets summer mars blue solar

Topic #17:
student mass ball different students experiment investigation weight best data

Topic #18:
cells body cell human blood organ systems digestive function circulatory

Topic #19:
water ice temperature nitrogen elements hydrogen oxygen cube used metals

Visualizing the LDA Topic Modeling Results

LDAvis_data_filepath = 'ldavis_prepared_'+str(number_topics)
LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
pyLDAvis.save_html(LDAvis_prepared, 'ldavis_prepared_'+ str(number_topics) +'.html')
pyLDAvis.display(LDAvis_prepared)

LSTM Modeling

import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
MAX_WORD_COUNT = 5000
MAX_SEQ_SIZE = 50
EMBEDDING_DIM = 50
EPOCHS = 10
BATCH_SIZE = 32
category = qc_questions['QCLabel'].apply(lambda x: x.split('_')[0]).apply(lambda x: 'LIFE' if x == 'OTHER, LIFE' else x)
qc_questions['Subject'] = category
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
def clean_text(text):
    text = text.lower() 
    text = REPLACE_BY_SPACE_RE.sub(' ', text) 
    text = BAD_SYMBOLS_RE.sub('', text) 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
qc_questions['Q_cleaned'] = qc_questions['Question'].apply(clean_text).str.replace('\d+', '')
tokenizer = Tokenizer(num_words=MAX_WORD_COUNT, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(qc_questions['Q_cleaned'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
Found 10719 unique tokens.
X = tokenizer.texts_to_sequences(qc_questions['Q_cleaned'].values)
X = pad_sequences(X, maxlen=MAX_SEQ_SIZE)
print('Shape of Design Matrix:', X.shape)
Shape of Design Matrix: (7787, 50)
y = pd.get_dummies(qc_questions['Subject']).values
print('Shape of Response Vector:', y.shape)
Shape of Response Vector: (7787, 9)
 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)
(6618, 50) (6618, 9)
(1169, 50) (1169, 9)
model = Sequential()
model.add(Embedding(MAX_WORD_COUNT, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(LSTM(50))
model.add(Dense(9, activation='relu'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_8 (Embedding)      (None, 50, 50)            250000    
_________________________________________________________________
lstm_8 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_8 (Dense)              (None, 9)                 459       
=================================================================
Total params: 270,659
Trainable params: 270,659
Non-trainable params: 0
_________________________________________________________________
None
history = model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_split=0.1)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/indexed_slices.py:434: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Train on 5956 samples, validate on 662 samples
Epoch 1/10
5956/5956 [==============================] - 26s 4ms/step - loss: 2.3624 - accuracy: 0.5831 - val_loss: 1.9518 - val_accuracy: 0.7523
Epoch 2/10
5956/5956 [==============================] - 25s 4ms/step - loss: 1.5921 - accuracy: 0.7456 - val_loss: 2.3677 - val_accuracy: 0.2900
Epoch 3/10
5956/5956 [==============================] - 25s 4ms/step - loss: 1.1511 - accuracy: 0.6187 - val_loss: 1.2828 - val_accuracy: 0.7447
Epoch 4/10
5956/5956 [==============================] - 25s 4ms/step - loss: 0.6156 - accuracy: 0.8716 - val_loss: 1.5353 - val_accuracy: 0.7795
Epoch 5/10
5956/5956 [==============================] - 25s 4ms/step - loss: 0.4157 - accuracy: 0.9256 - val_loss: 2.0267 - val_accuracy: 0.7855
Epoch 6/10
5956/5956 [==============================] - 25s 4ms/step - loss: 0.3279 - accuracy: 0.9466 - val_loss: 2.1312 - val_accuracy: 0.7779
Epoch 7/10
5956/5956 [==============================] - 25s 4ms/step - loss: 0.6632 - accuracy: 0.9077 - val_loss: 2.1040 - val_accuracy: 0.7885
Epoch 8/10
5956/5956 [==============================] - 25s 4ms/step - loss: 0.3984 - accuracy: 0.9384 - val_loss: 2.2170 - val_accuracy: 0.7613
Epoch 9/10
5956/5956 [==============================] - 25s 4ms/step - loss: 0.4357 - accuracy: 0.9308 - val_loss: 2.0835 - val_accuracy: 0.7885
Epoch 10/10
5956/5956 [==============================] - 26s 4ms/step - loss: 0.2827 - accuracy: 0.9599 - val_loss: 2.2143 - val_accuracy: 0.7734
accr = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))
1169/1169 [==============================] - 0s 334us/step
Test set
  Loss: 2.287
  Accuracy: 0.759
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();